//===========================================================================
/*!
 * 
 *
 * \brief       Traits of gpu expressions
 *
 * \author      O. Krause
 * \date        2016
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 * 
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 * 
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published 
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================

#ifndef REMORA_GPU_TRAITS_HPP
#define REMORA_GPU_TRAITS_HPP

#include <boost/compute/command_queue.hpp>
#include <boost/compute/core.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/functional/operator.hpp>
#include <boost/compute/functional.hpp>

namespace remora{namespace gpu{
	
template<class T, class Tag>
struct dense_vector_storage{
	typedef Tag storage_tag;
	
	boost::compute::buffer buffer;
	std::size_t offset;
	std::size_t stride;
	
	dense_vector_storage(){}
	dense_vector_storage(boost::compute::buffer const& buffer, std::size_t offset, std::size_t stride)
	:buffer(buffer), offset(offset), stride(stride){}
	template<class U, class Tag2>
	dense_vector_storage(dense_vector_storage<U, Tag2> const& storage):
	buffer(storage.buffer), offset(storage.offset), stride(storage.stride){
		static_assert(std::is_convertible<U&, T&>::value, "incompatible storage");
		static_assert(!(std::is_same<Tag,continuous_dense_tag>::value && std::is_same<Tag2,dense_tag>::value), "Trying to assign dense to continuous dense storage");
	}
	
	dense_vector_storage<T,Tag> sub_region(std::size_t offset) const{
		return {buffer, this->offset+offset * stride, stride};
	}
};

template<class T, class Tag>
struct dense_matrix_storage{
	typedef Tag storage_tag;
	template<class O>
	struct row_storage: public std::conditional<
		std::is_same<O,row_major>::value,
		dense_vector_storage<T, Tag>,
		dense_vector_storage<T, dense_tag>
	>{};
	template<class O>
	struct rows_storage: public std::conditional<
		std::is_same<O,row_major>::value,
		dense_matrix_storage<T, Tag>,
		dense_matrix_storage<T, dense_tag>
	>{};
	
	typedef dense_vector_storage<T,Tag> diag_storage;
	typedef dense_matrix_storage<T,dense_tag> sub_region_storage;
	
	boost::compute::buffer buffer;
	std::size_t offset;
	std::size_t leading_dimension;
	
	dense_matrix_storage(){}
	dense_matrix_storage(boost::compute::buffer const& buffer, std::size_t offset, std::size_t leading_dimension)
	:buffer(buffer), offset(offset), leading_dimension(leading_dimension){}
	
	template<class U, class Tag2>
	dense_matrix_storage(dense_matrix_storage<U, Tag2> const& storage):
	buffer(storage.buffer), offset(storage.offset), leading_dimension(storage.leading_dimension){
		static_assert(std::is_convertible<U&, T&>::value, "incompatible storage");
		static_assert(!(std::is_same<Tag,continuous_dense_tag>::value && std::is_same<Tag2,dense_tag>::value), "Trying to assign dense to continuous dense storage");
	}
	
	template<class Orientation>
	sub_region_storage sub_region(std::size_t offset1, std::size_t offset2, Orientation) const{
		std::size_t offset_major = Orientation::index_M(offset1,offset2);
		std::size_t offset_minor = Orientation::index_m(offset1,offset2);
		return {buffer, offset + offset_major*leading_dimension+offset_minor, leading_dimension};
	}
	
	template<class Orientation>
	typename row_storage<Orientation>::type row(std::size_t i, Orientation) const{
		return {buffer, offset + i * Orientation::index_M(leading_dimension,std::size_t(1)), Orientation::index_m(leading_dimension,std::size_t(1))};
	}
	
	template<class Orientation>
	typename rows_storage<Orientation>::type sub_rows(std::size_t i, Orientation) const{
		std::size_t stride = Orientation::index_M(leading_dimension,(std::size_t)1);
		return {buffer,offset + i * stride, leading_dimension};
	}
	
	diag_storage diag(){
		return {buffer, offset, leading_dimension+1};
	}
	
	dense_vector_storage<T, continuous_dense_tag> linear() const{
		return {buffer, offset, 1};
	}
};


//Expression objects and generated by the functors which are then turned into code by meta_kernel operator<<
//Note that often the type of the stored scalar can be different from its actual type, this is to allow replacing the value
//by a variable representing it (i.e. a kernel argument). This way we prevent hard coding variables in the generated source code
//in case the variable is indeed not constant
namespace detail{
template<class T, class Stored = T>
struct invoked_constant{
	Stored m_value;
};
	
template<class Arg1, class T, char Op, class Stored>
struct invoked_operator_scalar{
	typedef T result_type;
	Arg1 arg1;
	Stored m_scalar;
};

template<class Arg1, class T, class Stored = T>
struct invoked_add_scalar{
	typedef T result_type;
	Arg1 arg1;
	Stored m_scalar;
};

template<class Arg1, class Arg2, class T, class Stored=T>
struct invoked_multiply_and_add{
	typedef T result_type;
	Arg1 arg1;
	Arg2 arg2;
	Stored m_scalar;
};

template<class Arg1, class T>
struct invoked_soft_plus{
	typedef T result_type;
	Arg1 arg1;
};
template<class Arg1, class T>
struct invoked_sigmoid{
	typedef T result_type;
	Arg1 arg1;
};

template<class Arg1, class T>
struct invoked_sqr{
	typedef T result_type;
	Arg1 arg1;
};

template<class Arg1, class T>
struct invoked_inv{
	typedef T result_type;
	Arg1 arg1;
};

template<class Arg1, class Arg2, class T, class S>
struct invoked_safe_div{
	typedef T result_type;
	Arg1 arg1;
	Arg2 arg2;
	S default_value;
};


template<class Arg1, class T, char Op, class S>
boost::compute::detail::meta_kernel& operator<<(boost::compute::detail::meta_kernel& k, invoked_operator_scalar<Arg1,T, Op, S> const& e){
	return k << '('<<e.arg1 << Op << e.m_scalar<<')';
}
template<class Arg1, class Arg2, class T, class S>
boost::compute::detail::meta_kernel& operator<<(boost::compute::detail::meta_kernel& k, invoked_multiply_and_add<Arg1,Arg2,T, S> const& e){
	return k << '('<<e.arg1<<'+'<<e.m_scalar << '*'<< e.arg2<<')';
}
template<class Arg1, class T>
boost::compute::detail::meta_kernel& operator<<(boost::compute::detail::meta_kernel& k, invoked_soft_plus<Arg1,T> const& e){
	return k << "(log(1+exp("<< e.arg1<<")))";
}
template<class Arg1, class T>
boost::compute::detail::meta_kernel& operator<<(boost::compute::detail::meta_kernel& k, invoked_sigmoid<Arg1,T> const& e){
	return k << "(1/(1+exp(-"<< e.arg1<<")))";
}
template<class Arg1, class T>
boost::compute::detail::meta_kernel& operator<<(boost::compute::detail::meta_kernel& k, invoked_sqr<Arg1,T> const& e){
	return k << '('<<e.arg1<<'*'<<e.arg1<<')';
}
template<class Arg1, class T>
boost::compute::detail::meta_kernel& operator<<(boost::compute::detail::meta_kernel& k, invoked_inv<Arg1,T> const& e){
	return k << "1/("<<e.arg1<<')';
}

template<class T, class S>
boost::compute::detail::meta_kernel& operator<<(boost::compute::detail::meta_kernel& k, invoked_constant<T, S> const& e){
	return k << e.m_value;
}


template<class Arg1, class Arg2, class T, class S>
boost::compute::detail::meta_kernel& operator<<(boost::compute::detail::meta_kernel& k, invoked_safe_div<Arg1,Arg2,T, S> const& e){
	return k << "(("<<e.arg2<<"!=0)?"<<e.arg1<<'/'<<e.arg2<<':'<<e.default_value<<')';
}

}//End namespace detail
}//End namespace gpu

template<>
struct device_traits<gpu_tag>{
	typedef boost::compute::command_queue queue_type;
	
	static queue_type& default_queue(){
		return boost::compute::system::default_queue();
	}
	
	// iterators
	
	template <class Iterator, class Functor>
	struct transform_iterator{
		typedef no_iterator type;
	};
	
	template <class Iterator1, class Iterator2, class Functor>
	struct binary_transform_iterator{
		typedef no_iterator type;
	};
	
	template<class T>
	struct constant_iterator{
		typedef no_iterator type;
	};
	
	template<class T>
	struct one_hot_iterator{
		typedef no_iterator type;
	};
	
	template<class Closure>
	struct indexed_iterator{
		typedef no_iterator type;
	};
	
	//functional
	
	//G(F(args))
	template<class F, class G>
	struct compose{
		typedef typename G::result_type result_type;
		compose(F const& f, G const& g): m_f(f), m_g(g){ }
		
		template<class Arg1>
		auto operator()( Arg1 const& x) const -> decltype(std::declval<G const&>()(std::declval<F const&>()(x))){
			return m_g(m_f(x));
		}
		template<class Arg1, class Arg2>
		auto operator()( Arg1 const& x, Arg2 const& y) const -> decltype(std::declval<G const&>()(std::declval<F const&>()(x,y))){
			return m_g(m_f(x,y));
		}
		
		F m_f;
		G m_g;
	};
	
	//G(F1(args),F2(args))
	template<class F1, class F2, class G>
	struct compose_binary{
		typedef typename G::result_type result_type;
		compose_binary(F1 const& f1, F2 const& f2, G const& g): m_f1(f1), m_f2(f2), m_g(g){ }
		
		template<class Arg1>
		auto operator()( Arg1 const& x) const -> decltype(std::declval<G const&>()(std::declval<F1 const&>()(x),std::declval<F2 const&>()(x))){
			return m_g(m_f1(x), m_f2(x));
		}
		template<class Arg1, class Arg2>
		auto operator()( Arg1 const& x, Arg2 const& y) const -> decltype(std::declval<G const&>()(std::declval<F1 const&>()(x,y),std::declval<F2 const&>()(x,y))){
			return m_g(m_f1(x,y), m_f2(x,y));
		}
		
		F1 m_f1;
		F2 m_f2;
		G m_g;
	};
	
	
	//G(F1(arg1),F2(arg2))
	template<class F1, class F2, class G>
	struct transform_arguments{
		typedef typename G::result_type result_type;
		transform_arguments(F1 const& f1, F2 const& f2, G const& g): m_f1(f1), m_f2(f2), m_g(g){ }
		
		template<class Arg1, class Arg2>
		auto operator()( Arg1 const& x, Arg2 const& y) const -> decltype(std::declval<G const&>()(std::declval<F1 const&>()(x),std::declval<F2 const&>()(y))){
			return m_g(m_f1(x),m_f2(y));
		}
		
		F1 m_f1;
		F2 m_f2;
		G m_g;
	};
	
	template<class F, class Arg2>
	struct bind_second{
		typedef typename F::result_type result_type;
		bind_second(F const& f, Arg2 const& arg2) : m_function(f), m_arg2(arg2){ }
		
		template<class Arg1>
		auto operator()(Arg1 const& arg1) const -> decltype(std::declval<F const&>()(arg1,std::declval<Arg2 const&>()))
		{
			return m_function(arg1, m_arg2);
		}
		
		F m_function;
		Arg2 m_arg2;
	};
	
	
	//helper functions
	template<class F, class G>
	static compose<F,G> make_compose(F const& f, G const&g){
		return compose<F,G>(f,g);
	}
	
	template<class F1, class F2, class G>
	static compose_binary<F1, F2, G> make_compose_binary(F1 const& f1, F2 const& f2, G const&g){
		return compose_binary<F1, F2, G>(f1, f2, g);
	}
	
	template<class F1, class F2, class G>
	static transform_arguments<F1, F2, G> make_transform_arguments(F1 const& f1, F2 const& f2, G const& g){
		return transform_arguments<F1, F2, G>(f1, f2, g);
	}
	
	template<class F, class Arg2>
	static bind_second<F,Arg2> make_bind_second(F const& f, Arg2 const& arg2){
		return bind_second<F,Arg2>(f,arg2);
	}
	
	
	//functors
	
	//basic arithmetic
	template<class T>
	using add = boost::compute::plus<T>;
	template<class T>
	using subtract = boost::compute::minus<T>;
	template<class T>
	using multiply = boost::compute::multiplies<T>;
	template<class T>
	using divide = boost::compute::divides<T>;
	template<class T>
	using modulo = boost::compute::modulus<T>;
	template<class T>
	using pow = boost::compute::pow<T>;
	template<class T, class S=T>
	struct safe_divide{
		typedef T result_type;
		safe_divide(S const& default_value) : default_value(default_value) { }
		
		template<class Arg1, class Arg2>
		gpu::detail::invoked_safe_div<Arg1,Arg2, T,S> operator()(const Arg1 &x, const Arg2& y) const
		{
			return {x,y,default_value};
		}
		S default_value;
	};
	template<class T, class S= T>
	struct multiply_and_add{
		typedef T result_type;
		multiply_and_add(S const& scalar) :m_scalar(scalar) { }
		
		template<class Arg1, class Arg2>
		gpu::detail::invoked_multiply_and_add<Arg1,Arg2,T,S> operator()(const Arg1 &x, const Arg2& y) const
		{
			return {x,y, m_scalar};
		}
		S m_scalar;
	};
	
	
	template<class T, char Op, class S>
	struct operator_scalar{
		typedef T result_type;
		operator_scalar(S const& scalar) : m_scalar(scalar) { }
		
		template<class Arg1>
		gpu::detail::invoked_operator_scalar<Arg1,T, Op, S> operator()(Arg1 const& x) const
		{
			return {x, m_scalar};
		}
		S m_scalar;
	};
	
	template<class T>
	using multiply_scalar = operator_scalar<T, '*', T>;
	template<class T>
	using add_scalar = operator_scalar<T, '+', T>;
	template<class T>
	using divide_scalar = operator_scalar<T, '/', T>;
	template<class T>
	using modulo_scalar = operator_scalar<T, '%', T>;
	
	template<class T, class S=T>
	struct multiply_assign{
		typedef T result_type;
		multiply_assign(S const& scalar): m_scalar(scalar) { }
		
		template<class Arg1, class Arg2>
		gpu::detail::invoked_operator_scalar<Arg2,T,'*',S> operator()(const Arg1&, const Arg2& y) const
		{
			return {y, m_scalar};
		}
		S m_scalar;
	};
	template<class T>
	struct identity{
		typedef T result_type;
		
		template<class Arg>
		Arg const& operator()(Arg const& arg) const{
			return arg;
		}
	};
	
	template<class T>
	struct left_arg{
		typedef T result_type;
		
		template<class Arg1, class Arg2>
		Arg1 const& operator()(Arg1 const& arg1, Arg2 const&) const{
			return arg1;
		}
	};
	template<class T>
	struct right_arg{
		typedef T result_type;
		
		template<class Arg1, class Arg2>
		Arg2 const& operator()(Arg1 const&, Arg2 const& arg2) const{
			return arg2;
		}
	};
	
	template<class T, class S=T>
	struct constant{
		typedef T result_type;
		constant(S const& value): m_value(value){}
		
		template<class Arg>
		gpu::detail::invoked_constant<T,S> operator()(Arg const&) const
		{
			return {m_value};
		}
		template<class Arg1, class Arg2>
		gpu::detail::invoked_constant<T,S> operator()(Arg1 const&, Arg2 const&) const
		{
			return {m_value};
		}
		
		S m_value;
	};
	
	
	//math unary functions
	template<class T>
	using log = boost::compute::log<T>;
	template<class T>
	using exp = boost::compute::exp<T>;
	template<class T>
	using sin = boost::compute::sin<T>;
	template<class T>
	using cos = boost::compute::cos<T>;
	template<class T>
	using tan = boost::compute::tan<T>;
	template<class T>
	using asin = boost::compute::asin<T>;
	template<class T>
	using acos = boost::compute::acos<T>;
	template<class T>
	using atan = boost::compute::atan<T>;
	template<class T>
	using tanh = boost::compute::tanh<T>;
	template<class T>
	using sqrt = boost::compute::sqrt<T>;
	template<class T>
	using cbrt = boost::compute::cbrt<T>;
	template<class T>
	using abs = boost::compute::fabs<T>;
	
	template<class T>
	using erf = boost::compute::erf<T>;
	template<class T>
	using erfc = boost::compute::erfc<T>;
	
	template<class T>
	struct sqr{
		typedef T result_type;
		
		template<class Arg1>
		gpu::detail::invoked_sqr<Arg1,T> operator()(const Arg1 &x) const{
			return {x};
		}
	};
	template<class T>
	struct soft_plus{
		typedef T result_type;
		
		template<class Arg1>
		gpu::detail::invoked_soft_plus<Arg1,T> operator()(const Arg1 &x) const{
			return {x};
		}
	};
	template<class T>
	struct sigmoid{
		typedef T result_type;
		
		template<class Arg1>
		gpu::detail::invoked_sigmoid<Arg1,T> operator()(const Arg1 &x) const{
			return {x};
		}
	};
	template<class T>
	struct inv{
		typedef T result_type;
		
		template<class Arg1>
		gpu::detail::invoked_inv<Arg1,T> operator()(const Arg1 &x) const{
			return {x};
		}
	};
	
	//min/max
	template<class T>
	using min = boost::compute::fmin<T>;
	template<class T>
	using max = boost::compute::fmax<T>;
	
	//comparison
	template<class T>
	using less = boost::compute::less<T>;
	template<class T>
	using less_equal  = boost::compute::less_equal<T>;
	template<class T>
	using greater = boost::compute::greater<T>;
	template<class T>
	using greater_equal  = boost::compute::greater_equal<T>;
	template<class T>
	using equal = boost::compute::equal_to<T>;
	template<class T>
	using not_equal  = boost::compute::not_equal_to<T>;
};

namespace gpu{namespace detail{
	
    
struct meta_kernel;

template<class Entity>
struct register_with_compute_kernel{
	typedef Entity type;
	static type const& reg(meta_kernel&, Entity const& e){
		return e;
	}
};

struct meta_kernel: public boost::compute::detail::meta_kernel{
	meta_kernel(std::string const& name):boost::compute::detail::meta_kernel(name), m_id(0){}
	
    template<class T>
    std::string register_kernel_arg(T const& value){
        ++m_id;
        std::string name = "rem_var"+std::to_string(m_id);
        this->add_set_arg<T>(name,value);
        return name;
    }
    
	template<class Entity>
	typename register_with_compute_kernel<Entity>::type
	register_args(Entity const& e){
		return register_with_compute_kernel<Entity>::reg(*this,e);
	}
private:
	std::size_t m_id;
};

template<class F, class Arg2>
struct register_with_compute_kernel<device_traits<gpu_tag>::template bind_second<F,Arg2> >{
	typedef typename register_with_compute_kernel<F>::type f_type;
	typedef device_traits<gpu_tag>::template bind_second<f_type,std::string> type;
	static type reg(
		meta_kernel& k,
		device_traits<gpu_tag>::template bind_second<F,Arg2> const& f
	){
		std::string arg2_name = k.register_kernel_arg(f.m_arg2);
		return type(register_with_compute_kernel<F>::reg(k,f.m_function),arg2_name);
	}
};

template<class F, class G>
struct register_with_compute_kernel<device_traits<gpu_tag>::template compose<F, G> >{
	typedef typename register_with_compute_kernel<F>::type f_type;
	typedef typename register_with_compute_kernel<G>::type g_type;
	typedef typename device_traits<gpu_tag>::template compose<f_type, g_type> type;
	static type reg(
		meta_kernel& k,
		device_traits<gpu_tag>::compose<F, G> const& composed
	){
		auto f_reg = register_with_compute_kernel<F>::reg(k,composed.m_f);
		auto g_reg = register_with_compute_kernel<G>::reg(k,composed.m_g);
		return type(f_reg, g_reg);
	}
};

template<class F1, class F2, class G>
struct register_with_compute_kernel<device_traits<gpu_tag>::template compose_binary<F1, F2, G> >{
	typedef typename register_with_compute_kernel<F1>::type f1_type;
	typedef typename register_with_compute_kernel<F2>::type f2_type;
	typedef typename register_with_compute_kernel<G>::type g_type;
	typedef typename device_traits<gpu_tag>::template compose_binary<f1_type, f2_type, g_type> type;
	static type reg(
		meta_kernel& k,
		device_traits<gpu_tag>::compose_binary<F1, F2, G> const& composed
	){
		auto f1_reg = register_with_compute_kernel<F1>::reg(k,composed.m_f1);
		auto f2_reg = register_with_compute_kernel<F2>::reg(k,composed.m_f2);
		auto g_reg = register_with_compute_kernel<G>::reg(k,composed.m_g);
		return type(f1_reg, f2_reg, g_reg);
	}
};


template<class F1, class F2, class G>
struct register_with_compute_kernel<device_traits<gpu_tag>::template transform_arguments<F1, F2, G> >{
	typedef typename register_with_compute_kernel<F1>::type f1_type;
	typedef typename register_with_compute_kernel<F2>::type f2_type;
	typedef typename register_with_compute_kernel<G>::type g_type;
	typedef typename device_traits<gpu_tag>::template transform_arguments<f1_type, f2_type, g_type> type;
	static type reg(
		meta_kernel& k,
		device_traits<gpu_tag>::transform_arguments<F1, F2, G> const& composed
	){
		auto f1_reg = register_with_compute_kernel<F1>::reg(k,composed.m_f1);
		auto f2_reg = register_with_compute_kernel<F2>::reg(k,composed.m_f2);
		auto g_reg = register_with_compute_kernel<G>::reg(k,composed.m_g);
		return type(f1_reg, f2_reg, g_reg);
	}
};

template<class T>
struct register_with_compute_kernel<device_traits<gpu_tag>::template constant<T,T> >{
	typedef typename device_traits<gpu_tag>::template constant<T,std::string> type;
	static type reg(
		meta_kernel& k,
		device_traits<gpu_tag>::constant<T,T> const& f
	){
		return type(k.register_kernel_arg(f.m_value));
	}
};

template<class T>
struct register_with_compute_kernel<device_traits<gpu_tag>::template safe_divide<T,T> >{
	typedef typename device_traits<gpu_tag>::template safe_divide<T,std::string> type;
	static type reg(
		meta_kernel& k,
		device_traits<gpu_tag>::safe_divide<T,T> const& f
	){
		return type(k.register_kernel_arg(f.default_value));
	}
};

template<class T>
struct register_with_compute_kernel<device_traits<gpu_tag>::template multiply_and_add<T,T> >{
	typedef typename device_traits<gpu_tag>::template multiply_and_add<T,std::string> type;
	static type reg(
		meta_kernel& k,
		device_traits<gpu_tag>::multiply_and_add<T,T> const& f
	){
		return type(k.register_kernel_arg(f.m_scalar));
	}
};

template<class T, char Op>
struct register_with_compute_kernel<device_traits<gpu_tag>::template operator_scalar<T, Op, T> >{
	typedef typename device_traits<gpu_tag>::template operator_scalar<T, Op, std::string> type;
	static type reg(
		meta_kernel& k,
		device_traits<gpu_tag>::operator_scalar<T, Op, T> const& f
	){
		return type(k.register_kernel_arg(f.m_scalar));
	}
};

template<class T>
struct register_with_compute_kernel<device_traits<gpu_tag>::template multiply_assign<T,T> >{
	typedef typename device_traits<gpu_tag>::template multiply_assign<T,std::string> type;
	static type reg(
		meta_kernel& k,
		device_traits<gpu_tag>::multiply_assign<T,T> const& f
	){
		return type(k.register_kernel_arg(f.m_scalar));
	}
};


//vector element
template<class Arg, class T, class S>
struct invoked_dense_vector_element{
	typedef T result_type;
	Arg arg;
	S stride;
	S offset;
	boost::compute::buffer buffer;
};

template<class Arg,class T, class S>
boost::compute::detail::meta_kernel& operator<< (
	boost::compute::detail::meta_kernel& k, 
	invoked_dense_vector_element<Arg, T, S> const& e
){
	return k<< k.get_buffer_identifier<T>(e.buffer, boost::compute::memory_object::global_memory)
		<<" [ "<<e.offset <<"+("<<e.arg <<") *"<<e.stride<<']';
}

template<class T, class S=std::size_t>
struct dense_vector_element{
	typedef T result_type;
	
	template<class Arg>
	gpu::detail::invoked_dense_vector_element<Arg,T, S> operator()(Arg const& x) const{
		return {x, m_stride, m_offset, m_buffer};
	}
	boost::compute::buffer m_buffer;
	S m_stride;
	S m_offset;
};

template<class T>
struct register_with_compute_kernel<dense_vector_element<T,std::size_t> >{
	typedef dense_vector_element<T,std::string> type;
	static type reg(
		meta_kernel& k,
		dense_vector_element<T,std::size_t> const& e
	){
		return {e.m_buffer, k.register_kernel_arg(e.m_stride),k.register_kernel_arg(e.m_offset)};
	}
};

//matrix element
template<class Arg1, class Arg2,  class T, class S>
struct invoked_matrix_element{
	typedef T result_type;
	Arg1 arg1;
	Arg2 arg2;
	S stride1;
	S stride2;
	S offset;
	boost::compute::buffer buffer;
};


template<class Arg1, class Arg2, class T, class S>
boost::compute::detail::meta_kernel& operator<< (
	boost::compute::detail::meta_kernel& k, 
	invoked_matrix_element<Arg1, Arg2, T, S> const& e
){
	return k << k.get_buffer_identifier<T>(e.buffer, boost::compute::memory_object::global_memory)
				 <<'['<<e.offset<<"+ ("<<e.arg1 <<") * "<<e.stride1<<" + ("<<e.arg2 <<") * "<<e.stride2<<']';
}

template<class T, class S=std::size_t>
struct dense_matrix_element{
	typedef T result_type;

	template<class Arg1, class Arg2>
	gpu::detail::invoked_matrix_element<Arg1, Arg2, T, S> operator()(Arg1 const& x, Arg2 const& y) const{
		return {x, y, m_stride1, m_stride2, m_offset, m_buffer};
	}
	
	boost::compute::buffer m_buffer;
	S m_stride1;
	S m_stride2;
	S m_offset;
};

template<class T>
struct register_with_compute_kernel<dense_matrix_element<T,std::size_t> >{
	typedef dense_matrix_element<T,std::string> type;
	static type reg(
		meta_kernel& k,
		dense_matrix_element<T,std::size_t> const& e
	){
		auto const& stride1 = k.register_kernel_arg(e.m_stride1); 
		auto const& stride2 = k.register_kernel_arg(e.m_stride2); 
		auto const& offset = k.register_kernel_arg(e.m_offset); 
		return {e.m_buffer, stride1, stride2, offset};
	}
};

}}

}

#endif